# Alexander F. Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(tidylog)
library(here)
library(readxl)
library(janitor)

# Build data/inter/bgm_processed.rds.
#
# If a processed copy already exists in data/cache/ it is copied into place.
# Otherwise the restricted Excel workbook is read, announcement dates are
# parsed and validated, projects announced before 2022 are dropped (rows with
# no usable date are kept), coordinates are converted to numeric, and the
# result is saved to both the output path and the cache.
output_file <- here("data", "inter", "bgm_processed.rds")
cache_file <- here("data", "cache", "bgm_processed.rds")

# Copy `from` to `to`, creating the destination directory first.
# Returns TRUE on success, FALSE otherwise. file.copy() reports failure only
# through its return value, so callers must check it.
copy_into_place <- function(from, to) {
  dir.create(dirname(to), recursive = TRUE, showWarnings = FALSE)
  isTRUE(file.copy(from, to, overwrite = TRUE))
}

# Check for cached version first
if (file.exists(cache_file)) {
  message("[OK] Using cached BGM data from data/cache/")
  if (!copy_into_place(cache_file, output_file)) {
    stop("Could not copy cached BGM data to: ", output_file)
  }
  message("Copied cached BGM data to: ", output_file)
} else {
  # Need to process from restricted extdata
  message("Processing BGM data from restricted extdata...")

  extdata_file <- here(
    "extdata", "bgm_turner", "The-Big-Green-Machine Dataset.xlsx"
  )
  if (!file.exists(extdata_file)) {
    stop("BGM data file not found: ", extdata_file, "\n",
         "This is restricted data. Please either:\n",
         "1. Obtain the data from https://sites.google.com/view/biggreenmanufacturing\n",
         "2. Use the cached processed version in data/cache/")
  }

  # Load supply chain data
  supdat <- read_xlsx(
    extdata_file,
    sheet = "Archived Dataset-4-19-25",
    progress = FALSE
  )
  supdat <- clean_names(supdat)

  # Keep US projects and drop rows explicitly marked as undated ("n.d.")
  supdat <- supdat %>%
    filter(country == "USA") %>%
    filter(!project_announcement_date %in% c("n.d.", "N.D."))

  # Rows with a non-empty announcement date. Numeric values are interpreted
  # either as a literal year or as an Excel serial date (see case_when below).
  excel_serial_dates <- supdat %>%
    filter(!is.na(project_announcement_date)) %>%
    filter(project_announcement_date != "") %>%
    mutate(
      numeric_date = suppressWarnings(as.numeric(project_announcement_date)),
      date = case_when(
        # Whole numbers in 1900-2030 are treated as literal years.
        # case_when() evaluates this right-hand side for EVERY row, so an
        # explicit format is required: without it as.Date() stops with
        # "not in a standard unambiguous format" when the first value is a
        # serial number or non-numeric text. With a format, such rows simply
        # yield NA here and are handled by the branches below.
        !is.na(numeric_date) & numeric_date >= 1900 & numeric_date <= 2030 &
          numeric_date == round(numeric_date) ~
          as.Date(paste0(numeric_date, "-01-01"), format = "%Y-%m-%d"),
        # For all other numeric values, treat as Excel serial dates
        !is.na(numeric_date) ~
          as.Date(numeric_date, origin = "1899-12-30"),
        # For non-numeric values, set to NA
        TRUE ~ NA_Date_
      ),
      # Keep the raw spreadsheet value for reference
      project_announcement_date_original = project_announcement_date,
      # Replace the raw value with the parsed date in readable form
      project_announcement_date = as.character(date)
    ) %>%
    select(-numeric_date)

  # Rows whose announcement date is missing or blank: no date can be derived
  remaining_entries <- supdat %>%
    filter(
      is.na(project_announcement_date) | project_announcement_date == ""
    ) %>%
    mutate(date = NA_Date_)

  # === DATE VALIDATION ===
  message("=== Date Processing Validation ===")

  # Check total counts: the two subsets must partition supdat exactly
  total_original <- nrow(supdat)
  total_processed <- nrow(excel_serial_dates) + nrow(remaining_entries)
  message(sprintf("Original entries: %d", total_original))
  message(sprintf("Processed entries: %d", total_processed))
  message(sprintf(
    "Entries with valid dates: %d",
    sum(!is.na(excel_serial_dates$date))
  ))
  message(sprintf(
    "Entries with NA dates: %d",
    sum(is.na(excel_serial_dates$date)) + nrow(remaining_entries)
  ))

  if (total_original != total_processed) {
    stop("ERROR: Lost entries during processing!")
  }

  # Validate date ranges
  valid_dates <- excel_serial_dates$date[!is.na(excel_serial_dates$date)]
  if (length(valid_dates) > 0) {
    min_date <- min(valid_dates)
    max_date <- max(valid_dates)
    message(sprintf(
      "Date range: %s to %s", format(min_date), format(max_date)
    ))

    # Check for unreasonable dates
    earliest_ok <- as.Date("1960-01-01")
    latest_ok <- as.Date("2030-12-31")
    unreasonable_early <- sum(valid_dates < earliest_ok)
    unreasonable_late <- sum(valid_dates > latest_ok)

    if (unreasonable_early > 0) {
      warning(sprintf(
        "Found %d dates before 1960 - may indicate processing error",
        unreasonable_early
      ))
      early_examples <- excel_serial_dates %>%
        filter(!is.na(date) & date < earliest_ok) %>%
        select(site_id, project_announcement_date, date) %>%
        head(5)
      print(early_examples)
    }

    if (unreasonable_late > 0) {
      warning(sprintf(
        "Found %d dates after 2030 - may indicate processing error",
        unreasonable_late
      ))
      late_examples <- excel_serial_dates %>%
        filter(!is.na(date) & date > latest_ok) %>%
        select(site_id, project_announcement_date, date) %>%
        head(5)
      print(late_examples)
    }

    # Only claim success when neither range check raised a warning
    if (unreasonable_early == 0 && unreasonable_late == 0) {
      message("[OK] Date range validation passed")
    }
  }

  # Validate specific conversion examples (spot checks against raw values
  # that may or may not be present in the data)
  message("=== Testing Specific Conversions ===")
  test_conversions <- tibble(
    input = c("1965", "2006", "2017", "45049", "30529", "35431"),
    expected_year = c(1965, 2006, 2017, 2023, 1983, 1997),
    description = c(
      "Early year", "Mid year", "Recent year",
      "Recent serial", "Early serial", "Mid serial"
    )
  )

  for (i in seq_len(nrow(test_conversions))) {
    input_val <- test_conversions$input[i]
    expected_year <- test_conversions$expected_year[i]
    desc <- test_conversions$description[i]

    processed_row <- excel_serial_dates %>%
      filter(project_announcement_date_original == input_val) %>%
      slice(1)

    if (nrow(processed_row) > 0) {
      actual_year <- year(processed_row$date)
      # isTRUE() so an NA year is reported as a failure instead of making
      # `if` error on a missing condition
      if (isTRUE(actual_year == expected_year)) {
        message(sprintf(
          "[OK] %s: %s -> %s (year %d)",
          desc, input_val, processed_row$date, actual_year
        ))
      } else {
        warning(sprintf(
          "[FAIL] %s: %s -> %s (year %d, expected %d)",
          desc, input_val, processed_row$date, actual_year, expected_year
        ))
      }
    } else {
      message(sprintf("- %s: %s not found in data", desc, input_val))
    }
  }

  # Check for failed conversions. Count ALL failures before truncating the
  # preview, so the warning reports the true number rather than at most 10.
  failed_all <- excel_serial_dates %>%
    filter(is.na(date)) %>%
    select(
      site_id,
      project_announcement_date_original,
      project_announcement_date
    )
  failed_conversions <- head(failed_all, 10)

  if (nrow(failed_all) > 0) {
    warning(sprintf("Found %d failed date conversions:", nrow(failed_all)))
    print(failed_conversions)
  } else {
    message("[OK] No failed date conversions")
  }

  message("=== Date Validation Complete ===")
  message("")

  # === FILTER PROJECTS BY DATE ===
  message("=== Filtering Projects by Announcement Date ===")

  # Filter out projects announced before 2022; rows without a date are kept
  pre_filter_count <- nrow(excel_serial_dates) + nrow(remaining_entries)
  excel_serial_dates_filtered <- excel_serial_dates %>%
    filter(is.na(date) | year(date) >= 2022)
  remaining_entries_filtered <- remaining_entries %>%
    filter(is.na(date) | year(date) >= 2022)

  post_filter_count <- nrow(excel_serial_dates_filtered) +
    nrow(remaining_entries_filtered)
  filtered_out_count <- pre_filter_count - post_filter_count

  message(sprintf("Projects before filtering: %d", pre_filter_count))
  message(sprintf("Projects after filtering (2022+): %d", post_filter_count))
  message(sprintf("Projects filtered out (pre-2022): %d", filtered_out_count))

  if (filtered_out_count > 0) {
    # Show the five earliest removed projects
    filtered_examples <- excel_serial_dates %>%
      filter(!is.na(date) & year(date) < 2022) %>%
      select(
        site_id,
        project_announcement_date_original,
        project_announcement_date,
        date
      ) %>%
      arrange(date) %>%
      head(5)

    if (nrow(filtered_examples) > 0) {
      message("Examples of filtered projects:")
      for (i in seq_len(nrow(filtered_examples))) {
        message(sprintf("  Site %s: %s (%s) → %s",
                        filtered_examples$site_id[i],
                        filtered_examples$project_announcement_date_original[i],
                        filtered_examples$project_announcement_date[i],
                        filtered_examples$date[i]))
      }
    }
  }

  message("[OK] Date filtering complete")
  message("")

  # Update variables for downstream processing
  excel_serial_dates <- excel_serial_dates_filtered
  remaining_entries <- remaining_entries_filtered

  # Combine and process coordinates
  supdat.processed <- bind_rows(excel_serial_dates, remaining_entries) %>%
    # Blank out placeholder coordinate strings before numeric conversion
    mutate(
      longitude = case_when(
        longitude %in% c("TBD", "NA") ~ NA_character_,
        TRUE ~ longitude
      ),
      latitude = case_when(
        latitude %in% c("TBD", "NA") ~ NA_character_,
        TRUE ~ latitude
      )
    ) %>%
    # Convert to numeric (suppressing expected warnings for cleaner output)
    mutate(
      across(c(longitude, latitude), ~ suppressWarnings(as.numeric(.x)))
    )

  # === FINAL VALIDATION ===
  message("=== Final Data Quality Check ===")

  final_total <- nrow(supdat.processed)
  final_valid_dates <- sum(!is.na(supdat.processed$date))
  final_na_dates <- sum(is.na(supdat.processed$date))

  # Percentage of the final dataset; 0 rather than NaN when it is empty
  pct_of_final <- function(n) {
    if (final_total > 0) 100 * n / final_total else 0
  }

  message(sprintf("Final dataset: %d total projects", final_total))
  message(sprintf(
    "- %d projects with valid dates (%.1f%%)",
    final_valid_dates, pct_of_final(final_valid_dates)
  ))
  message(sprintf(
    "- %d projects with NA dates (%.1f%%)",
    final_na_dates, pct_of_final(final_na_dates)
  ))

  # Check coordinate completeness
  has_coords <- !is.na(supdat.processed$longitude) &
    !is.na(supdat.processed$latitude)
  valid_coords <- sum(has_coords)
  missing_coords <- sum(!has_coords)
  message(sprintf(
    "- %d projects with coordinates (%.1f%%)",
    valid_coords, pct_of_final(valid_coords)
  ))
  message(sprintf(
    "- %d projects missing coordinates (%.1f%%)",
    missing_coords, pct_of_final(missing_coords)
  ))

  # Date distribution by year
  if (final_valid_dates > 0) {
    date_summary <- supdat.processed %>%
      filter(!is.na(date)) %>%
      mutate(year = year(date)) %>%
      count(year, name = "projects") %>%
      arrange(year)

    message(sprintf("Date distribution spans %d years (%d to %d):",
                    nrow(date_summary),
                    min(date_summary$year),
                    max(date_summary$year)))

    # Show top 5 years by project count; with_ties = FALSE caps the list at
    # five rows even when several years share the same count
    top_years <- date_summary %>%
      slice_max(projects, n = 5, with_ties = FALSE)
    for (i in seq_len(nrow(top_years))) {
      message(sprintf(
        "  %d: %d projects", top_years$year[i], top_years$projects[i]
      ))
    }
  }

  # Validation summary
  message("Data processing summary:")
  message(sprintf("- Original projects: %d", total_original))
  message(sprintf("- Projects after date filtering (2022+): %d", final_total))
  message(sprintf(
    "- Projects filtered out (pre-2022): %d",
    total_original - post_filter_count
  ))

  # Every row that had a raw announcement value must have a parsed date.
  # The ORIGINAL column is used here: project_announcement_date was replaced
  # by as.character(date) above, so it is NA exactly when date is NA and
  # cannot detect failed conversions. Rows from remaining_entries have no
  # original column and are NA after bind_rows(), so they are skipped.
  had_raw_value <- !is.na(supdat.processed$project_announcement_date_original)
  all_raw_values_parsed <- all(!is.na(supdat.processed$date[had_raw_value]))

  if (final_total == post_filter_count &&
      final_valid_dates > 0 &&
      all_raw_values_parsed) {
    message("[OK] All validations passed - data quality confirmed")
  } else {
    warning("[WARN] Some validation checks failed - review data quality")
  }

  message("=== Validation Complete ===")

  # saveRDS() fails if the target directory does not exist
  dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE)
  saveRDS(supdat.processed, output_file)
  message("[OK] Saved BGM data with validated dates to: ", output_file)

  # Caching is a convenience: warn, but do not abort, if it fails
  if (copy_into_place(output_file, cache_file)) {
    message("[OK] Cached BGM data to data/cache/")
  } else {
    warning("Could not cache BGM data to: ", cache_file)
  }
}  # End of else block for processing from extdata